# Shared ggplot2 theme for time-series / distribution charts.
# Fixed: element_line(size =) is deprecated since ggplot2 3.4 in favour of
# `linewidth` (this file already uses `linewidth` in its geom_line calls).
plotTheme <- theme(
  plot.title = element_text(size = 14, face = "bold"),
  plot.subtitle = element_text(size = 10),
  plot.caption = element_text(size = 8),
  axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
  axis.text.y = element_text(size = 10),
  axis.title = element_text(size = 11, face = "bold"),
  panel.background = element_blank(),
  panel.grid.major = element_line(colour = "#D0D0D0", linewidth = 0.2),
  panel.grid.minor = element_blank(),
  axis.ticks = element_blank(),
  legend.position = "right"
)
# Shared ggplot2 theme for maps: hides axes and grid, keeps titles and a
# slim vertical colourbar on the right.
mapTheme <- theme(
  plot.title = element_text(size = 14, face = "bold"),
  plot.subtitle = element_text(size = 10),
  plot.caption = element_text(size = 8),
  axis.line = element_blank(),
  axis.text = element_blank(),
  axis.ticks = element_blank(),
  axis.title = element_blank(),
  panel.background = element_blank(),
  panel.border = element_blank(),
  # element_blank() is the idiomatic way to hide grid lines (clearer and
  # cheaper than drawing them in a transparent colour)
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  legend.position = "right",
  plot.margin = margin(1, 1, 1, 1, "cm"),
  legend.key.height = unit(1, "cm"),
  legend.key.width = unit(0.2, "cm")
)
palette5 <- c("#eff3ff", "#bdd7e7", "#6baed6", "#3182bd", "#08519c")

Assignment 5: Space-Time Prediction of Bike Share Demand
Part 1: Q4 (October - December) 2024 Data
I have chosen to examine Q4 data from 2024 (last year) as we are currently in Q4 of 2025 as of writing, and I believe last year’s patterns for this time of year would be best to inform predictions for Q4 2025.
# Read Q4 2024 Indego trip data (relative path into the project's data/ folder)
indego <- read_csv("../data/indego-trips-2024-q4.csv")
# Quick look at the data: 299,121 trips x 15 columns. Note start_time /
# end_time arrive as character ("10/1/2024 0:00") and are parsed later
# with lubridate::mdy_hm().
glimpse(indego)Rows: 299,121
Columns: 15
$ trip_id <dbl> 1050296434, 1050293063, 1050296229, 1050276817, 10…
$ duration <dbl> 22, 8, 12, 1, 5, 9, 5, 6, 16, 16, 16, 39, 6, 40, 1…
$ start_time <chr> "10/1/2024 0:00", "10/1/2024 0:06", "10/1/2024 0:0…
$ end_time <chr> "10/1/2024 0:22", "10/1/2024 0:14", "10/1/2024 0:1…
$ start_station <dbl> 3322, 3166, 3007, 3166, 3075, 3166, 3030, 3010, 33…
$ start_lat <dbl> 39.93638, 39.97195, 39.94517, 39.97195, 39.96718, …
$ start_lon <dbl> -75.15526, -75.13445, -75.15993, -75.13445, -75.16…
$ end_station <dbl> 3375, 3017, 3244, 3166, 3102, 3008, 3203, 3163, 33…
$ end_lat <dbl> 39.96036, 39.98003, 39.93865, 39.97195, 39.96759, …
$ end_lon <dbl> -75.14020, -75.14371, -75.16674, -75.13445, -75.17…
$ bike_id <chr> "03580", "05386", "18082", "02729", "18519", "0272…
$ plan_duration <dbl> 365, 365, 365, 1, 30, 1, 365, 365, 30, 30, 30, 30,…
$ trip_route_category <chr> "One Way", "One Way", "One Way", "Round Trip", "On…
$ passholder_type <chr> "Indego365", "Indego365", "Indego365", "Walk-up", …
$ bike_type <chr> "standard", "standard", "electric", "standard", "e…
Explore Data
# How many trips?
cat("Total trips in Q4 2024:", nrow(indego), "\n")Total trips in Q4 2024: 299121
# How many unique stations?
cat("Unique start stations:", length(unique(indego$start_station)), "\n")Unique start stations: 256
# Trip types
table(indego$trip_route_category)
One Way Round Trip
282675 16446
# Passholder types
table(indego$passholder_type)
Day Pass Indego30 Indego365 IndegoFlex NULL Walk-up
10711 159895 112690 1 1165 14659
# Bike types
table(indego$bike_type)
electric standard
175503 123618
Time Bins
indego <- indego %>%
  mutate(
    # Parse "m/d/YYYY H:MM" character stamps into POSIXct datetimes
    start_datetime = mdy_hm(start_time),
    end_datetime = mdy_hm(end_time),
    # Create hourly bins: every trip is assigned to the hour it started in
    interval60 = floor_date(start_datetime, unit = "hour"),
    # Extract time features.
    # NOTE(review): lubridate::week() counts 7-day blocks from Jan 1, so
    # Q4 spans weeks ~40-53; the train/test split below relies on this.
    week = week(interval60),
    month = month(interval60, label = TRUE),
    dotw = wday(interval60, label = TRUE),
    hour = hour(interval60),
    date = as.Date(interval60),
    # Create useful indicators: weekend flag and a fixed commute-hour window
    weekend = ifelse(dotw %in% c("Sat", "Sun"), 1, 0),
    rush_hour = ifelse(hour %in% c(7, 8, 9, 16, 17, 18), 1, 0)
  )
# Look at temporal features
head(indego %>% select(start_datetime, interval60, week, dotw, hour, weekend))# A tibble: 6 × 6
start_datetime interval60 week dotw hour weekend
<dttm> <dttm> <dbl> <ord> <int> <dbl>
1 2024-10-01 00:00:00 2024-10-01 00:00:00 40 Tue 0 0
2 2024-10-01 00:06:00 2024-10-01 00:00:00 40 Tue 0 0
3 2024-10-01 00:06:00 2024-10-01 00:00:00 40 Tue 0 0
4 2024-10-01 00:06:00 2024-10-01 00:00:00 40 Tue 0 0
5 2024-10-01 00:07:00 2024-10-01 00:00:00 40 Tue 0 0
6 2024-10-01 00:08:00 2024-10-01 00:00:00 40 Tue 0 0
Trips Over Time
# Daily trip counts: one row per calendar date
daily_trips <- indego %>%
  count(date, name = "trips")

# Daily ridership time series with a dashed smoothed-trend overlay
ggplot(daily_trips, aes(x = date, y = trips)) +
  geom_line(color = "#3182bd", linewidth = 1) +
  geom_smooth(se = FALSE, color = "red", linetype = "dashed") +
  labs(
    title = "Indego Daily Ridership - Q4 2024",
    subtitle = "Fall demand patterns in Philadelphia",
    x = "Date",
    y = "Daily Trips",
    caption = "Source: Indego bike share"
  ) +
plotTheme

Ridership falls during the autumn-to-winter transition. There is also notably low ridership toward the end of November — November 28th, Thanksgiving 2024 — as well as at the end of December, likely December 25th, Christmas. We can confirm this intuition by examining those specific dates.
# Ridership on the two holidays of interest
turkey_day <- daily_trips %>% filter(date == "2024-11-28")
xmas <- daily_trips %>% filter(date == "2024-12-25")

# Average ridership on comparable non-holiday weekdays:
# all other Thursdays (vs Thanksgiving) and Wednesdays (vs Christmas)
typical_boring_thurs <- indego %>%
  filter(dotw == "Thu", date != "2024-11-28") %>%
  count(date) %>%
  summarize(avg_thurs_trips = mean(n))
typical_boring_wed <- indego %>%
  filter(dotw == "Wed", date != "2024-12-25") %>%
  count(date) %>%
  summarize(avg_wed_trips = mean(n))
print(turkey_day)# A tibble: 1 × 2
date trips
<date> <int>
1 2024-11-28 604
print(typical_boring_thurs)# A tibble: 1 × 1
avg_thurs_trips
<dbl>
1 3634.
print(xmas)# A tibble: 1 × 2
date trips
<date> <int>
1 2024-12-25 495
print(typical_boring_wed)# A tibble: 1 × 1
avg_wed_trips
<dbl>
1 3845.
Hourly Patterns
# Average trips by hour and day type.
# Fixed: pass .groups = "drop" so summarize() returns an ungrouped tibble
# (silences the grouping message and avoids surprises downstream).
# NOTE(review): the denominator is the number of distinct dates with at
# least one trip in that (hour, weekend) cell, so quiet hours with
# zero-trip days are slightly over-estimated — fine for a descriptive plot.
hourly_patterns <- indego %>%
  group_by(hour, weekend) %>%
  summarize(avg_trips = n() / n_distinct(date), .groups = "drop") %>%
  mutate(day_type = ifelse(weekend == 1, "Weekend", "Weekday"))

ggplot(hourly_patterns, aes(x = hour, y = avg_trips, color = day_type)) +
  geom_line(linewidth = 1.2) +
  scale_color_manual(values = c("Weekday" = "#08519c", "Weekend" = "#6baed6")) +
  labs(
    title = "Average Hourly Ridership Patterns",
    subtitle = "Clear commute patterns on weekdays",
    x = "Hour of Day",
    y = "Average Trips per Hour",
    color = "Day Type"
  ) +
plotTheme

Weekdays see two peaks during prime work-commute hours, whereas weekends see a smooth curve of average hourly trips throughout daylight hours, peaking around mid-afternoon.
Top Stations
# Most popular origin stations, ranked by trip volume
top_stations <- indego %>%
  count(start_station, start_lat, start_lon, name = "trips") %>%
  arrange(desc(trips))

# Render the 20 busiest stations as a styled table.
# NOTE(review): format.args applies big.mark to every numeric column,
# which is why station IDs render as "3,010" in the output.
top_stations %>%
  head(20) %>%
  kable(
    caption = "Top 20 Indego Stations by Trip Origins",
    format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| start_station | start_lat | start_lon | trips |
|---|---|---|---|
| 3,010 | 39.94711 | -75.16618 | 5,943 |
| 3,032 | 39.94527 | -75.17971 | 4,471 |
| 3,359 | 39.94888 | -75.16978 | 3,923 |
| 3,244 | 39.93865 | -75.16674 | 3,492 |
| 3,295 | 39.95028 | -75.16027 | 3,411 |
| 3,020 | 39.94855 | -75.19007 | 3,369 |
| 3,208 | 39.95048 | -75.19324 | 3,343 |
| 3,066 | 39.94561 | -75.17348 | 3,342 |
| 3,054 | 39.96250 | -75.17420 | 3,297 |
| 3,101 | 39.94295 | -75.15955 | 3,214 |
| 3,022 | 39.95472 | -75.18323 | 3,152 |
| 3,028 | 39.94061 | -75.14958 | 3,101 |
| 3,362 | 39.94816 | -75.16226 | 3,072 |
| 3,063 | 39.94633 | -75.16980 | 2,972 |
| 3,185 | 39.95169 | -75.15888 | 2,952 |
| 3,059 | 39.96244 | -75.16121 | 2,926 |
| 3,012 | 39.94218 | -75.17747 | 2,907 |
| 3,061 | 39.95425 | -75.17761 | 2,847 |
| 3,161 | 39.95486 | -75.18091 | 2,837 |
| 3,256 | 39.95269 | -75.17779 | 2,816 |
Load Philadelphia Census Data
Map Philadelphia Context
# Map median income by census tract with station trip volumes overlaid.
# NOTE(review): philly_census is built in the (unshown) census-loading
# chunk above; it must be an sf object in lon/lat for the raw lon/lat
# point overlay to line up with geom_sf — confirm its CRS is 4326.
ggplot() +
  geom_sf(data = philly_census, aes(fill = Med_Inc), color = NA) +
  scale_fill_viridis(
    option = "viridis",
    name = "Median\nIncome",
    labels = scales::dollar
  ) +
  labs(
    title = "Philadelphia Median Household Income by Census Tract",
    subtitle = "Context for understanding bike share demand patterns"
  ) +
  # Stations: one point per station, coloured by origin-trip volume
  geom_point(
    data = top_stations,
    aes(x = start_lon, y = start_lat, color = trips),
    size = 0.75, alpha = 0.6
  ) +
  scale_color_gradientn(
    colours = c("#FAFF89", "#D3321D"),
    name = "Trips\nOriginating at Station"
  ) +
  # Force both the fill and colour scales to render as colourbars
  guides(
    fill = guide_colorbar(),
    color = guide_colorbar()
  ) +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal" # <-- puts them side by side
  ) +
mapThemeJoin Census Data to Stations
# Build an sf point layer of unique stations (WGS84 lon/lat)
stations_sf <- indego %>%
  distinct(start_station, start_lat, start_lon) %>%
  filter(!is.na(start_lat), !is.na(start_lon)) %>%
  st_as_sf(coords = c("start_lon", "start_lat"), crs = 4326)

# Spatial join: attach the census tract each station point falls inside.
# left = TRUE keeps stations that land outside every tract (their census
# columns become NA); those stations are investigated below.
stations_census <- st_join(stations_sf, philly_census, left = TRUE) %>%
  st_drop_geometry()
# Look at the result - investigate whether all of the stations joined to
# census data; according to the map above there are stations in
# non-residential tracts.
stations_for_map <- indego %>%
  distinct(start_station, start_lat, start_lon) %>%
  filter(!is.na(start_lat), !is.na(start_lon)) %>%
  left_join(
    stations_census %>% select(start_station, Med_Inc),
    by = "start_station"
  ) %>%
  # has_census: TRUE when the station matched a tract (Med_Inc present)
  mutate(has_census = !is.na(Med_Inc))
summary(stations_for_map$has_census) Mode FALSE TRUE
logical 19 237
# Add back to trip data.
# NOTE(review): this join is superseded later — indego_census is
# recomputed after filtering to valid_stations. This first version still
# includes trips from stations with no census match (NA demographics).
indego_census <- indego %>%
  left_join(
    stations_census %>%
      select(start_station, Med_Inc, Percent_Taking_Transit,
             Percent_White, Total_Pop),
    by = "start_station"
  )
# Prepare data for visualization.
# NOTE(review): this recomputes stations_for_map identically to the chunk
# above; it is harmless but redundant and could be removed.
stations_for_map <- indego %>%
  distinct(start_station, start_lat, start_lon) %>%
  filter(!is.na(start_lat), !is.na(start_lon)) %>%
  left_join(
    stations_census %>% select(start_station, Med_Inc),
    by = "start_station"
  ) %>%
  mutate(has_census = !is.na(Med_Inc))
# Create the map showing problem stations (those that missed the tract join).
# NOTE(review): geom_sf's `size` for outlines is deprecated in ggplot2
# >= 3.4 in favour of `linewidth` — harmless here, but worth updating.
ggplot() +
  geom_sf(data = philly_census, aes(fill = Med_Inc), color = "white", size = 0.1) +
  scale_fill_viridis(
    option = "viridis",
    name = "Median\nIncome",
    labels = scales::dollar,
    na.value = "grey90"
  ) +
  # Stations with census data (small grey dots)
  geom_point(
    data = stations_for_map %>% filter(has_census),
    aes(x = start_lon, y = start_lat),
    color = "grey30", size = 1, alpha = 0.6
  ) +
  # Stations WITHOUT census data (red X marks the spot)
  geom_point(
    data = stations_for_map %>% filter(!has_census),
    aes(x = start_lon, y = start_lat),
    color = "red", size = 1, shape = 4, stroke = 1.5
  ) +
  labs(
    title = "Philadelphia Median Household Income by Census Tract",
    subtitle = "Indego stations shown (RED = no census data match)",
    caption = "Red X marks indicate stations that didn't join to census tracts"
  ) +
mapTheme# Identify which stations to keep
# Stations that matched a census tract (non-NA income)
valid_stations <- stations_census %>%
  filter(!is.na(Med_Inc)) %>%
  pull(start_station)

# Filter trip data to valid stations only, then attach tract demographics.
# This replaces the earlier indego_census, dropping the 19 unmatched stations.
indego_census <- indego %>%
  filter(start_station %in% valid_stations) %>%
  left_join(
    stations_census %>%
      select(start_station, Med_Inc, Percent_Taking_Transit,
             Percent_White, Total_Pop),
    by = "start_station"
  )Get Weather Data
# Get weather from Philadelphia International Airport (ASOS station "PHL")
# This covers Q4 2024: October 1 - December 31
weather_data <- riem_measures(
  station = "PHL", # Philadelphia International Airport
  date_start = "2024-10-01",
  date_end = "2024-12-31"
)

# Process weather data: bin METAR observations to the hour and keep the
# three covariates used in the models
weather_processed <- weather_data %>%
  mutate(
    interval60 = floor_date(valid, unit = "hour"),
    Temperature = tmpf, # Temperature in Fahrenheit (NA when not reported)
    Precipitation = ifelse(is.na(p01i), 0, p01i), # Hourly precip in inches
    Wind_Speed = sknt # Wind speed in knots
  ) %>%
  select(interval60, Temperature, Precipitation, Wind_Speed) %>%
  # NOTE(review): distinct() drops only exact duplicate rows; an hour with
  # multiple differing METAR reports keeps multiple rows, which would fan
  # out the later join — verify one row per interval60 (e.g. summarise
  # with first()).
  distinct()

# Check for missing hours and interpolate if needed.
# NOTE(review): .direction = "down" carries the last observation forward,
# so NA hours *before* the first observation stay NA; consider "downup"
# if leading gaps matter.
weather_complete <- weather_processed %>%
  complete(interval60 = seq(min(interval60), max(interval60), by = "hour")) %>%
  fill(Temperature, Precipitation, Wind_Speed, .direction = "down")
# Look at the weather
summary(weather_complete %>% select(Temperature, Precipitation, Wind_Speed)) Temperature Precipitation Wind_Speed
Min. :12.00 Min. :0.000000 Min. : 0.000
1st Qu.:41.00 1st Qu.:0.000000 1st Qu.: 4.000
Median :51.00 Median :0.000000 Median : 7.000
Mean :50.80 Mean :0.004177 Mean : 7.663
3rd Qu.:60.95 3rd Qu.:0.000000 3rd Qu.:11.000
Max. :83.00 Max. :0.520000 Max. :30.000
Visualize Weather Patterns
# Hourly temperature trace across the study window.
# Fixed: the title/subtitle previously read "Q1 2025" / "Winter to early
# spring transition", but this panel covers October-December 2024.
ggplot(weather_complete, aes(x = interval60, y = Temperature)) +
  geom_line(color = "#3182bd", alpha = 0.7) +
  geom_smooth(se = FALSE, color = "red") +
  labs(
    title = "Philadelphia Temperature - Q4 2024",
    subtitle = "Fall to winter transition",
    x = "Date",
    y = "Temperature (°F)"
  ) +
plotThemeCreate Space-Time Panel: Aggregate Trips to Station-Hour Level
# Count trips by station-hour: one row per station-hour with >= 1 trip.
# Station-level attributes are included in the count keys so they survive
# the aggregation (they are constant within each station).
trips_panel <- indego_census %>%
  count(interval60, start_station, start_lat, start_lon,
        Med_Inc, Percent_Taking_Transit, Percent_White, Total_Pop,
        name = "Trip_Count")
# How many station-hour observations?
nrow(trips_panel)[1] 150972
# How many unique stations?
length(unique(trips_panel$start_station))[1] 237
# How many unique hours?
length(unique(trips_panel$interval60))[1] 2208
Create Complete Panel Structure
# Calculate expected panel size
n_stations <- length(unique(trips_panel$start_station))
n_hours <- length(unique(trips_panel$interval60))
expected_rows <- n_stations * n_hours
cat("Expected panel rows:", format(expected_rows, big.mark = ","), "\n")Expected panel rows: 523,296
cat("Current rows:", format(nrow(trips_panel), big.mark = ","), "\n")Current rows: 150,972
cat("Missing rows:", format(expected_rows - nrow(trips_panel), big.mark = ","), "\n")Missing rows: 372,324
# Create complete panel: every station x every hour, zeros filled in
study_panel <- expand.grid(
  interval60 = unique(trips_panel$interval60),
  start_station = unique(trips_panel$start_station)
) %>%
  # Join trip counts.
  # NOTE(review): trips_panel also carries lat/lon and census columns, so
  # this join pulls them in (NA on zero-trip hours); the station-attribute
  # join below then duplicates them with .x/.y suffixes — which is why
  # later models reference Med_Inc.x / Percent_Taking_Transit.y. Selecting
  # only interval60/start_station/Trip_Count here would avoid the suffixes,
  # but the downstream model formulas depend on the suffixed names.
  left_join(trips_panel, by = c("interval60", "start_station")) %>%
  # Replace NA trip counts with 0
  mutate(Trip_Count = replace_na(Trip_Count, 0))

# Fill in station attributes (they're the same for all hours)
station_attributes <- trips_panel %>%
  group_by(start_station) %>%
  summarize(
    start_lat = first(start_lat),
    start_lon = first(start_lon),
    Med_Inc = first(Med_Inc),
    Percent_Taking_Transit = first(Percent_Taking_Transit),
    Percent_White = first(Percent_White),
    Total_Pop = first(Total_Pop)
  )

study_panel <- study_panel %>%
  left_join(station_attributes, by = "start_station")
# Verify we have complete panel
cat("Complete panel rows:", format(nrow(study_panel), big.mark = ","), "\n")Complete panel rows: 523,296
Add Time Features
# Recompute calendar features on the complete panel (zero-trip rows
# included), mirroring the features built on the raw trips earlier
study_panel <- study_panel %>%
  mutate(
    week = week(interval60),
    month = month(interval60, label = TRUE),
    dotw = wday(interval60, label = TRUE),
    hour = hour(interval60),
    date = as.Date(interval60),
    weekend = ifelse(dotw %in% c("Sat", "Sun"), 1, 0),
    rush_hour = ifelse(hour %in% c(7, 8, 9, 16, 17, 18), 1, 0)
  )Join Weather Data
# Attach hourly weather covariates to every station-hour row
study_panel <- study_panel %>%
  left_join(weather_complete, by = "interval60")
# Check for missing values
summary(study_panel %>% select(Trip_Count, Temperature, Precipitation)) Trip_Count Temperature Precipitation
Min. : 0.0000 Min. :12.0 Min. :0.0000
1st Qu.: 0.0000 1st Qu.:41.0 1st Qu.:0.0000
Median : 0.0000 Median :51.0 Median :0.0000
Mean : 0.5178 Mean :50.8 Mean :0.0042
3rd Qu.: 1.0000 3rd Qu.:61.0 3rd Qu.:0.0000
Max. :28.0000 Max. :83.0 Max. :0.5200
NA's :5688 NA's :5688
Create Temporal Lag Variables
# Sort by station and time so lag() looks back in chronological order
study_panel <- study_panel %>%
  arrange(start_station, interval60)

# Create lag variables WITHIN each station: group_by prevents one
# station's history from bleeding into the next station's first hours
study_panel <- study_panel %>%
  group_by(start_station) %>%
  mutate(
    lag1Hour = lag(Trip_Count, 1),
    lag2Hours = lag(Trip_Count, 2),
    lag3Hours = lag(Trip_Count, 3),
    lag12Hours = lag(Trip_Count, 12),
    lag1day = lag(Trip_Count, 24)
  ) %>%
  ungroup()

# Remove rows with NA lags (first 24 hours for each station).
# Filtering on lag1day alone suffices: it is the longest lag, so any row
# with a non-NA lag1day has all shorter lags populated too.
study_panel_complete <- study_panel %>%
  filter(!is.na(lag1day))
cat("Rows after removing NA lags:", format(nrow(study_panel_complete), big.mark = ","), "\n")Rows after removing NA lags: 605,298
Visualize Lag Correlations
# Sample one station to visualize (first 168 hourly rows = one week)
example_station <- study_panel_complete %>%
  filter(start_station == 3212) %>%
  head(168) # One week

# Plot actual vs lagged demand: the lagged series are time-shifted copies
# of the current series, so visual alignment shows their predictive value
ggplot(example_station, aes(x = interval60)) +
  geom_line(aes(y = Trip_Count, color = "Current"), linewidth = 1) +
  geom_line(aes(y = lag1Hour, color = "1 Hour Ago"), linewidth = 1, alpha = 0.7) +
  geom_line(aes(y = lag1day, color = "24 Hours Ago"), linewidth = 1, alpha = 0.7) +
  scale_color_manual(values = c(
    "Current" = "#08519c",
    "1 Hour Ago" = "#F09B4E",
    "24 Hours Ago" = "#D3321D"
  )) +
  labs(
    title = "Temporal Lag Patterns at One Station",
    subtitle = "Past demand predicts future demand",
    x = "Date-Time",
    y = "Trip Count",
    color = "Time Period"
  ) +
plotThemeTemporal Train/Test Split
Approach: Train on weeks 1-9, test on weeks 10-13 (predicting future from past)
# Split by week
# Q4 has weeks 40-53 (Oct-Dec)
# Train on weeks 40-49 (Oct 1 - early December)
# Test on weeks 50-53 (rest of December)

# Which stations have trips in BOTH early and late periods?
early_stations <- study_panel_complete %>%
  filter(week < 50) %>%
  filter(Trip_Count > 0) %>%
  distinct(start_station) %>%
  pull(start_station)
late_stations <- study_panel_complete %>%
  filter(week >= 50) %>%
  filter(Trip_Count > 0) %>%
  distinct(start_station) %>%
  pull(start_station)

# Keep only stations that appear in BOTH periods, so train and test share
# the same station set (needed for the station fixed effects in model 4)
common_stations <- intersect(early_stations, late_stations)

# Filter panel to only common stations
study_panel_complete <- study_panel_complete %>%
  filter(start_station %in% common_stations)

# NOW create train/test split
train <- study_panel_complete %>%
  filter(week < 50)
test <- study_panel_complete %>%
  filter(week >= 50)

# NOTE(review): cat() strips the Date class, which is why the date ranges
# below print as day counts (19997 = 2024-10-01); wrap min/max in
# format() or as.character() to print readable dates.
cat("Training observations:", format(nrow(train), big.mark = ","), "\n")Training observations: 410,628
cat("Testing observations:", format(nrow(test), big.mark = ","), "\n")Testing observations: 171,684
cat("Training date range:", min(train$date), "to", max(train$date), "\n")Training date range: 19997 to 20065
cat("Testing date range:", min(test$date), "to", max(test$date), "\n")Testing date range: 20066 to 20088
Model 1: Baseline (Time + Weather)
# Create day of week factor with treatment (dummy) coding
train <- train %>%
  mutate(dotw_simple = factor(dotw, levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")))
# Set contrasts to treatment coding (dummy variables); Monday (level 1)
# becomes the reference category for the dotw_simple coefficients
contrasts(train$dotw_simple) <- contr.treatment(7)
# Model 1 baseline: hour-of-day dummies, day-of-week dummies, and weather
model1 <- lm(
  Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation,
  data = train
)
summary(model1)
Call:
lm(formula = Trip_Count ~ as.factor(hour) + dotw_simple + Temperature +
Precipitation, data = train)
Residuals:
Min 1Q Median 3Q Max
-1.7089 -0.6783 -0.2159 0.1882 27.2727
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.489156 0.013731 -35.624 < 0.0000000000000002 ***
as.factor(hour)1 -0.055267 0.012906 -4.282 0.0000184994686084 ***
as.factor(hour)2 -0.078435 0.012690 -6.181 0.0000000006378151 ***
as.factor(hour)3 -0.110234 0.012655 -8.711 < 0.0000000000000002 ***
as.factor(hour)4 -0.074204 0.012566 -5.905 0.0000000035263476 ***
as.factor(hour)5 0.038930 0.012623 3.084 0.002042 **
as.factor(hour)6 0.296063 0.012639 23.424 < 0.0000000000000002 ***
as.factor(hour)7 0.562184 0.012853 43.738 < 0.0000000000000002 ***
as.factor(hour)8 0.940524 0.012603 74.629 < 0.0000000000000002 ***
as.factor(hour)9 0.681527 0.012691 53.700 < 0.0000000000000002 ***
as.factor(hour)10 0.562441 0.012526 44.900 < 0.0000000000000002 ***
as.factor(hour)11 0.602993 0.012538 48.094 < 0.0000000000000002 ***
as.factor(hour)12 0.698148 0.012435 56.145 < 0.0000000000000002 ***
as.factor(hour)13 0.667308 0.012338 54.085 < 0.0000000000000002 ***
as.factor(hour)14 0.685610 0.012386 55.355 < 0.0000000000000002 ***
as.factor(hour)15 0.791576 0.012780 61.940 < 0.0000000000000002 ***
as.factor(hour)16 0.906925 0.012549 72.273 < 0.0000000000000002 ***
as.factor(hour)17 1.110205 0.012639 87.842 < 0.0000000000000002 ***
as.factor(hour)18 0.831297 0.012728 65.314 < 0.0000000000000002 ***
as.factor(hour)19 0.541516 0.012768 42.412 < 0.0000000000000002 ***
as.factor(hour)20 0.341348 0.012858 26.547 < 0.0000000000000002 ***
as.factor(hour)21 0.233603 0.012881 18.135 < 0.0000000000000002 ***
as.factor(hour)22 0.175194 0.012820 13.666 < 0.0000000000000002 ***
as.factor(hour)23 0.076033 0.012906 5.891 0.0000000038337890 ***
dotw_simple2 0.065588 0.006869 9.549 < 0.0000000000000002 ***
dotw_simple3 0.062920 0.006857 9.177 < 0.0000000000000002 ***
dotw_simple4 -0.050061 0.006571 -7.619 0.0000000000000257 ***
dotw_simple5 -0.012223 0.006833 -1.789 0.073612 .
dotw_simple6 -0.024444 0.006806 -3.591 0.000329 ***
dotw_simple7 -0.050983 0.006924 -7.363 0.0000000000001797 ***
Temperature 0.012467 0.000161 77.432 < 0.0000000000000002 ***
Precipitation -1.109233 0.081793 -13.562 < 0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.144 on 410596 degrees of freedom
Multiple R-squared: 0.1141, Adjusted R-squared: 0.114
F-statistic: 1705 on 31 and 410596 DF, p-value: < 0.00000000000000022
Patterns:
- Tuesday and Wednesday have positive coefficients (0.066 and 0.063)
- Thursday through Sunday have negative coefficients
- Tuesday has the highest weekday effect (+0.066)
- This might be a reflection of concentrated commuting patterns at the beginning of the week; toward the end of the work week there may be more flexible hybrid or work-from-home jobs, resulting in fewer trips than on a Monday.
Hourly Patterns
Hour Coefficient Interpretation 0 (baseline) 0.000 trips/hour (midnight) 1 -0.055 slightly fewer than midnight … 5 +0.039 morning activity starting … 8 +0.941 PEAK morning rush … 10 +0.562 post-rush … 17 +1.110 PEAK evening rush … 23 +0.076 late night minimal
Model 2: Add Temporal Lags
# Model 2: add temporal lags (1h, 3h, 24h) of the station's own demand
model2 <- lm(
  Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation +
    lag1Hour + lag3Hours + lag1day,
  data = train
)
summary(model2)
Call:
lm(formula = Trip_Count ~ as.factor(hour) + dotw_simple + Temperature +
Precipitation + lag1Hour + lag3Hours + lag1day, data = train)
Residuals:
Min 1Q Median 3Q Max
-12.8428 -0.4697 -0.1265 0.1225 25.5313
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.1971724 0.0120171 -16.408 < 0.0000000000000002 ***
as.factor(hour)1 -0.0175731 0.0112639 -1.560 0.11873
as.factor(hour)2 -0.0093400 0.0110785 -0.843 0.39918
as.factor(hour)3 -0.0323163 0.0110523 -2.924 0.00346 **
as.factor(hour)4 -0.0064036 0.0109793 -0.583 0.55973
as.factor(hour)5 0.0625618 0.0110386 5.668 0.00000001449553616 ***
as.factor(hour)6 0.2581886 0.0110639 23.336 < 0.0000000000000002 ***
as.factor(hour)7 0.4087297 0.0112659 36.280 < 0.0000000000000002 ***
as.factor(hour)8 0.6475387 0.0110725 58.482 < 0.0000000000000002 ***
as.factor(hour)9 0.2687374 0.0111533 24.095 < 0.0000000000000002 ***
as.factor(hour)10 0.2122365 0.0109778 19.333 < 0.0000000000000002 ***
as.factor(hour)11 0.2554385 0.0109944 23.234 < 0.0000000000000002 ***
as.factor(hour)12 0.3583675 0.0108988 32.881 < 0.0000000000000002 ***
as.factor(hour)13 0.3239304 0.0108163 29.948 < 0.0000000000000002 ***
as.factor(hour)14 0.3531805 0.0108536 32.540 < 0.0000000000000002 ***
as.factor(hour)15 0.4322898 0.0112024 38.589 < 0.0000000000000002 ***
as.factor(hour)16 0.5127136 0.0110123 46.558 < 0.0000000000000002 ***
as.factor(hour)17 0.6404994 0.0111130 57.635 < 0.0000000000000002 ***
as.factor(hour)18 0.3266069 0.0112005 29.160 < 0.0000000000000002 ***
as.factor(hour)19 0.1612724 0.0111995 14.400 < 0.0000000000000002 ***
as.factor(hour)20 0.0449273 0.0112804 3.983 0.00006812693842296 ***
as.factor(hour)21 0.0488508 0.0112677 4.335 0.00001454699054146 ***
as.factor(hour)22 0.0606778 0.0111954 5.420 0.00000005967221309 ***
as.factor(hour)23 0.0183651 0.0112646 1.630 0.10303
dotw_simple2 0.0070021 0.0059975 1.167 0.24301
dotw_simple3 -0.0128484 0.0059925 -2.144 0.03203 *
dotw_simple4 -0.0448243 0.0057357 -7.815 0.00000000000000551 ***
dotw_simple5 -0.0422563 0.0059684 -7.080 0.00000000000144379 ***
dotw_simple6 -0.0367596 0.0059417 -6.187 0.00000000061504630 ***
dotw_simple7 -0.0593011 0.0060464 -9.808 < 0.0000000000000002 ***
Temperature 0.0040641 0.0001426 28.494 < 0.0000000000000002 ***
Precipitation -0.9676346 0.0714492 -13.543 < 0.0000000000000002 ***
lag1Hour 0.3238000 0.0014825 218.412 < 0.0000000000000002 ***
lag3Hours 0.1180469 0.0014504 81.387 < 0.0000000000000002 ***
lag1day 0.2027487 0.0013895 145.912 < 0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9988 on 410593 degrees of freedom
Multiple R-squared: 0.3252, Adjusted R-squared: 0.3252
F-statistic: 5821 on 34 and 410593 DF, p-value: < 0.00000000000000022
Adding lags improves R² by about 20 percentage points. This is likely because the state of the Indego station an hour or a few hours before affects how many trips can be taken hours later, and the one-day lag likely accounts for the peaks and ebbs at the same time a day before, so it makes sense that this model can account for more of the variance in the data.
Model 3: Add Demographics
# Model 3: add tract demographics.
# NOTE(review): the .x/.y suffixed names come from the panel-construction
# joins (station attributes were joined in twice); Med_Inc.x etc. are the
# copies that are NA on zero-trip rows, which is why the summary below
# reports 276,768 observations deleted due to missingness — this model is
# fit on far fewer rows than models 1-2.
model3 <- lm(
  Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation +
    lag1Hour + lag3Hours + lag1day +
    Med_Inc.x + Percent_Taking_Transit.y + Percent_White.y,
  data = train
)
summary(model3)
Call:
lm(formula = Trip_Count ~ as.factor(hour) + dotw_simple + Temperature +
Precipitation + lag1Hour + lag3Hours + lag1day + Med_Inc.x +
Percent_Taking_Transit.y + Percent_White.y, data = train)
Residuals:
Min 1Q Median 3Q Max
-10.2603 -0.7198 -0.2719 0.4456 24.6278
Coefficients:
Estimate Std. Error t value
(Intercept) 0.7009115282 0.0386608126 18.130
as.factor(hour)1 -0.0339064631 0.0443755122 -0.764
as.factor(hour)2 -0.0733390073 0.0472207425 -1.553
as.factor(hour)3 -0.1736198797 0.0563179188 -3.083
as.factor(hour)4 -0.1602016516 0.0526556216 -3.042
as.factor(hour)5 -0.0842198175 0.0392042217 -2.148
as.factor(hour)6 0.1812725910 0.0339389718 5.341
as.factor(hour)7 0.3659932453 0.0327798040 11.165
as.factor(hour)8 0.6419351722 0.0315874296 20.322
as.factor(hour)9 0.0689540443 0.0318174207 2.167
as.factor(hour)10 0.0455849373 0.0318601406 1.431
as.factor(hour)11 0.0951277607 0.0317831322 2.993
as.factor(hour)12 0.1718233329 0.0312986651 5.490
as.factor(hour)13 0.1501299648 0.0313008986 4.796
as.factor(hour)14 0.1546199385 0.0311814820 4.959
as.factor(hour)15 0.2738677333 0.0314129060 8.718
as.factor(hour)16 0.3825663042 0.0310863390 12.307
as.factor(hour)17 0.5958195674 0.0311303049 19.140
as.factor(hour)18 0.1871034280 0.0314626011 5.947
as.factor(hour)19 0.0105635560 0.0320040115 0.330
as.factor(hour)20 -0.0963042374 0.0327876103 -2.937
as.factor(hour)21 -0.0718694380 0.0336061149 -2.139
as.factor(hour)22 -0.0356940800 0.0343008260 -1.041
as.factor(hour)23 -0.0793554621 0.0360569574 -2.201
dotw_simple2 0.0015399442 0.0129720488 0.119
dotw_simple3 -0.0058302798 0.0131356919 -0.444
dotw_simple4 -0.0584003672 0.0128820191 -4.533
dotw_simple5 -0.0902616975 0.0132777688 -6.798
dotw_simple6 -0.0296928615 0.0133240066 -2.229
dotw_simple7 -0.0404284310 0.0137730955 -2.935
Temperature 0.0060129110 0.0003255823 18.468
Precipitation -2.6993476761 0.2537854011 -10.636
lag1Hour 0.2432506608 0.0023823455 102.106
lag3Hours 0.0728128784 0.0024806873 29.352
lag1day 0.1565602280 0.0022935086 68.262
Med_Inc.x -0.0000001894 0.0000001237 -1.531
Percent_Taking_Transit.y -0.0038935558 0.0004528281 -8.598
Percent_White.y 0.0037864819 0.0002282348 16.590
Pr(>|t|)
(Intercept) < 0.0000000000000002 ***
as.factor(hour)1 0.44482
as.factor(hour)2 0.12040
as.factor(hour)3 0.00205 **
as.factor(hour)4 0.00235 **
as.factor(hour)5 0.03170 *
as.factor(hour)6 0.0000000925173 ***
as.factor(hour)7 < 0.0000000000000002 ***
as.factor(hour)8 < 0.0000000000000002 ***
as.factor(hour)9 0.03022 *
as.factor(hour)10 0.15249
as.factor(hour)11 0.00276 **
as.factor(hour)12 0.0000000403123 ***
as.factor(hour)13 0.0000016175893 ***
as.factor(hour)14 0.0000007104928 ***
as.factor(hour)15 < 0.0000000000000002 ***
as.factor(hour)16 < 0.0000000000000002 ***
as.factor(hour)17 < 0.0000000000000002 ***
as.factor(hour)18 0.0000000027402 ***
as.factor(hour)19 0.74135
as.factor(hour)20 0.00331 **
as.factor(hour)21 0.03247 *
as.factor(hour)22 0.29805
as.factor(hour)23 0.02775 *
dotw_simple2 0.90550
dotw_simple3 0.65715
dotw_simple4 0.0000058070122 ***
dotw_simple5 0.0000000000107 ***
dotw_simple6 0.02585 *
dotw_simple7 0.00333 **
Temperature < 0.0000000000000002 ***
Precipitation < 0.0000000000000002 ***
lag1Hour < 0.0000000000000002 ***
lag3Hours < 0.0000000000000002 ***
lag1day < 0.0000000000000002 ***
Med_Inc.x 0.12579
Percent_Taking_Transit.y < 0.0000000000000002 ***
Percent_White.y < 0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.274 on 133822 degrees of freedom
(276768 observations deleted due to missingness)
Multiple R-squared: 0.2192, Adjusted R-squared: 0.219
F-statistic: 1015 on 37 and 133822 DF, p-value: < 0.00000000000000022
Model 4: Add Station Fixed Effects
# Model 4: add station fixed effects (one dummy per station)
model4 <- lm(
  Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation +
    lag1Hour + lag3Hours + lag1day +
    Med_Inc.x + Percent_Taking_Transit.y + Percent_White.y +
    as.factor(start_station),
  data = train
)
# Summary too long with all station dummies, just show key metrics
cat("Model 4 R-squared:", summary(model4)$r.squared, "\n")Model 4 R-squared: 0.2469249
cat("Model 4 Adj R-squared:", summary(model4)$adj.r.squared, "\n")Model 4 Adj R-squared: 0.2454536
Model 5: Add Rush Hour Interaction
# Model 5: add month dummies, a rush-hour flag, and a rush x weekend
# interaction (lm() includes the weekend main effect automatically when
# given rush_hour * weekend)
model5 <- lm(
  Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation +
    lag1Hour + lag3Hours + lag1day + rush_hour + as.factor(month) +
    Med_Inc.x + Percent_Taking_Transit.y + Percent_White.y +
    as.factor(start_station) +
    rush_hour * weekend, # Rush hour effects different on weekends
  data = train
)
cat("Model 5 R-squared:", summary(model5)$r.squared, "\n")Model 5 R-squared: 0.2526638
cat("Model 5 Adj R-squared:", summary(model5)$adj.r.squared, "\n")Model 5 Adj R-squared: 0.251187
Model Evaluation: Calculate Predictions and MAE
# Get predictions on test set
# Create day of week factor with treatment (dummy) coding, mirroring the
# transformation applied to train so the model matrices line up
test <- test %>%
  mutate(dotw_simple = factor(dotw, levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")))
# Set contrasts to treatment coding (dummy variables)
contrasts(test$dotw_simple) <- contr.treatment(7)
# One prediction column per model; rows with NA predictors (e.g. missing
# demographics for models 3-5) yield NA predictions
test <- test %>%
  mutate(
    pred1 = predict(model1, newdata = test),
    pred2 = predict(model2, newdata = test),
    pred3 = predict(model3, newdata = test),
    pred4 = predict(model4, newdata = test),
    pred5 = predict(model5, newdata = test)
  )
# Calculate MAE for each model.
# NOTE(review): na.rm = TRUE means each MAE is averaged over a different
# subset of test rows (models 3-5 produce NA predictions where
# demographics are missing), so the five MAEs are not computed on
# identical samples — compare across models with care.
mae_results <- data.frame(
  Model = c(
    "1. Time + Weather",
    "2. + Temporal Lags",
    "3. + Demographics",
    "4. + Station FE",
    "5. + Rush Hour Interaction"
  ),
  MAE = c(
    mean(abs(test$Trip_Count - test$pred1), na.rm = TRUE),
    mean(abs(test$Trip_Count - test$pred2), na.rm = TRUE),
    mean(abs(test$Trip_Count - test$pred3), na.rm = TRUE),
    mean(abs(test$Trip_Count - test$pred4), na.rm = TRUE),
    mean(abs(test$Trip_Count - test$pred5), na.rm = TRUE)
  )
)
kable(mae_results,
      digits = 2,
      caption = "Mean Absolute Error by Model (Test Set)",
      col.names = c("Model", "MAE (trips)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))| Model | MAE (trips) |
|---|---|
| 1. Time + Weather | 0.54 |
| 2. + Temporal Lags | 0.40 |
| 3. + Demographics | 0.64 |
| 4. + Station FE | 0.66 |
| 5. + Rush Hour Interaction | 0.65 |
Temporal Lags improved the model the most; the biggest drop in mean absolute error was from the model accounting for time of day, day of week, and weather, and then adding in the temporal lags. In fact, adding demographics made the model perform worse than the baseline model, as well as adding station fixed effects and interaction effects on top of that.
Compared to Q1 2025:
The MAE values for Q4 2024 are fairly similar across the models. I believe this might be because both quarters account for season changes into winter (Q4) and out of winter (Q1) that make it harder to predict bike share in the latter part of the quarter based on the first, even when taking into account weather effects. There are also a high concentration of winter holidays that likely affect bike travel in Q4, such as Thanksgiving and Winter Break holidays, and in Q1 New Years, Valentine’s Day, and Spring Break holidays.
Part 2: Error Analysis
Observed vs. Predicted Error Analysis
# Signed and absolute prediction errors for Model 2, plus a labelled
# time-of-day bucket for faceting. case_when() evaluates top-down, so the
# descending checks here partition the day exactly as the original
# ascending version did: <7 Overnight, 7-9 AM Rush, 10-14 Mid-Day,
# 15-18 PM Rush, 19-23 Evening.
test <- test %>%
  mutate(
    error = Trip_Count - pred2,
    abs_error = abs(error),
    time_of_day = case_when(
      hour > 18 ~ "Evening",
      hour >= 15 ~ "PM Rush",
      hour >= 10 ~ "Mid-Day",
      hour >= 7 ~ "AM Rush",
      TRUE ~ "Overnight"
    )
  )
# Scatter plot by time and day type: observed vs. predicted trips for
# Model 2, faceted by weekend flag (rows) and time-of-day bucket (columns).
# The red 45-degree line marks perfect prediction; the green line is an OLS
# fit of predicted on observed, so a green slope flatter than the red line
# indicates systematic under-prediction of high-demand hours.
ggplot(test, aes(x = Trip_Count, y = pred2)) +
geom_point(alpha = 0.2, color = "#3182bd") +
geom_abline(slope = 1, intercept = 0, color = "red", linewidth = 1) +
geom_smooth(method = "lm", se = FALSE, color = "darkgreen") +
facet_grid(weekend ~ time_of_day) +
labs(
title = "Observed vs. Predicted Bike Trips",
subtitle = "Model 2 performance by time period",
x = "Observed Trips",
y = "Predicted Trips",
caption = "Red line = perfect predictions; Green line = actual model fit"
) +
plotThemeThe model consistently under predicts trips on weekdays, weekends, at all time periods of the day. The one period that appears to be moderately better predicted is the AM Rush period of weekdays, and generally slightly better at predicting weekdays than weekends. However, the consistent under-predicting seems to signify there is a major driver of Indego trips that this model does not account for well.
Spatial Error Patterns
# Calculate MAE by station: station-level error summary for Model 2 on the
# test weeks — mean absolute error and average observed demand per station,
# dropping rows without predictions or coordinates.
# NOTE(review): the original computed station_errors twice back-to-back; the
# first pass (which lacked the !is.na(pred2) filter) was immediately
# overwritten, so only the second, filtered version is kept here.
# NOTE(review): the start_lat.x / start_lon.y suffixes suggest duplicated
# columns from an earlier join — presumably station latitude/longitude;
# verify upstream that these are the intended coordinate columns.
## Create Two Maps Side-by-Side with Proper Legends (sorry these maps are ugly)
station_errors <- test %>%
  filter(!is.na(pred2)) %>%
  group_by(start_station, start_lat.x, start_lon.y) %>%
  summarize(
    MAE = mean(abs(Trip_Count - pred2), na.rm = TRUE),
    avg_demand = mean(Trip_Count, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(!is.na(start_lat.x), !is.na(start_lon.y))
# Map 1: Prediction Errors
# The colour scale and colourbar styling are named up front so the plot
# chain itself stays short; behaviour is identical to inlining them.
p1_mae_scale <- scale_color_viridis(
  option = "plasma",
  name = "MAE\n(trips)",
  direction = -1,
  breaks = c(0.5, 1.0, 1.5), # Fewer, cleaner breaks
  labels = c("0.5", "1.0", "1.5")
)
p1_colorbar <- guides(color = guide_colorbar(
  barwidth = 1.5,
  barheight = 12,
  title.position = "top",
  title.hjust = 0.5
))
# Census tracts as a grey basemap, stations as points coloured by MAE.
p1 <- ggplot() +
  geom_sf(data = philly_census, fill = "grey95", color = "white", size = 0.2) +
  geom_point(
    data = station_errors,
    aes(x = start_lon.y, y = start_lat.x, color = MAE),
    size = 3.5,
    alpha = 0.7
  ) +
  p1_mae_scale +
  labs(title = "Prediction Errors",
       subtitle = "Higher in Center City") +
  mapTheme +
  theme(
    legend.position = "right",
    legend.title = element_text(size = 10, face = "bold"),
    legend.text = element_text(size = 9),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 10)
  ) +
  p1_colorbar
# Map 2: Average Demand — same basemap and point styling as the error map
# (p1) so the two panels are directly comparable side by side.
p2 <- ggplot() +
geom_sf(data = philly_census, fill = "grey95", color = "white", size = 0.2) +
geom_point(
data = station_errors,
aes(x = start_lon.y, y = start_lat.x, color = avg_demand),
size = 3.5,
alpha = 0.7
) +
# direction = -1 reverses the viridis ramp so high demand reads as dark
scale_color_viridis(
option = "viridis",
name = "Avg\nDemand",
direction = -1,
breaks = c(0.5, 1.0, 1.5, 2.0, 2.5), # Clear breaks
labels = c("0.5", "1.0", "1.5", "2.0", "2.5")
) +
labs(title = "Average Demand",
subtitle = "Trips per station-hour") +
mapTheme +
theme(
legend.position = "right",
legend.title = element_text(size = 10, face = "bold"),
legend.text = element_text(size = 9),
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 10)
) +
guides(color = guide_colorbar(
barwidth = 1.5,
barheight = 12,
title.position = "top",
title.hjust = 0.5
))
# patchwork: render the error map and the demand map side by side
p1 | p2Higher MAEs are clustered around center city, where trips per hour are also higher on average. This nuances our understanding of the model’s under-performance overall, as we can see that MAEs are not equal across all stations but higher error gaps where there are higher trips. This indicates that there are spatial variables that this model lacks.
Temporal Error Patterns
# MAE by time of day and day type: average absolute error within each
# (time_of_day, weekend) cell, then a dodged bar chart comparing weekday vs.
# weekend error by period.
temporal_errors <- test %>%
group_by(time_of_day, weekend) %>%
summarize(
MAE = mean(abs_error, na.rm = TRUE),
.groups = "drop"
) %>%
# Human-readable label for the fill legend (weekend is a 0/1 flag)
mutate(day_type = ifelse(weekend == 1, "Weekend", "Weekday"))
ggplot(temporal_errors, aes(x = time_of_day, y = MAE, fill = day_type)) +
geom_col(position = "dodge") +
scale_fill_manual(values = c("Weekday" = "#08519c", "Weekend" = "#6baed6")) +
labs(
title = "Prediction Errors by Time Period",
subtitle = "When is the model struggling most?",
x = "Time of Day",
y = "Mean Absolute Error (trips)",
fill = "Day Type"
) +
plotTheme +
theme(axis.text.x = element_text(angle = 45, hjust = 1))MAEs are higher for weekdays than weekends on average, but combined with the previous plots of observed vs predicted values, this shows that the mean absolute errors partially higher for weekdays because of higher trip counts on weekdays typically than weekends, and higher trip counts during that AM and PM rush. In other words, this again confirms that the model is creating predictions that are too uniform accross time and space.
Errors and Demographics
# Join demographic data to station errors: attach tract-level income, transit
# use, and race shares to each station's MAE, dropping stations that did not
# match a tract (NA Med_Inc).
station_errors_demo <- station_errors %>%
left_join(
station_attributes %>% select(start_station, Med_Inc, Percent_Taking_Transit, Percent_White),
by = "start_station"
) %>%
filter(!is.na(Med_Inc))
# Create plots: one MAE-vs-covariate scatter per demographic variable, each
# with a linear trend line to show the direction of association.
inc_plot <- ggplot(station_errors_demo, aes(x = Med_Inc, y = MAE)) +
geom_point(alpha = 0.5, color = "#3182bd") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
scale_x_continuous(labels = scales::dollar) +
labs(title = "Errors vs. Median Income", x = "Median Income", y = "MAE") +
plotTheme
transit_plot <- ggplot(station_errors_demo, aes(x = Percent_Taking_Transit, y = MAE)) +
geom_point(alpha = 0.5, color = "#3182bd") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Errors vs. Transit Usage", x = "% Taking Transit", y = "MAE") +
plotTheme
white_plot <- ggplot(station_errors_demo, aes(x = Percent_White, y = MAE)) +
geom_point(alpha = 0.5, color = "#3182bd") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Errors vs. Race", x = "% White", y = "MAE") +
plotTheme
# Arrange the three panels in a 2-column grid (third panel wraps below)
grid.arrange(inc_plot, transit_plot, white_plot, ncol = 2)Prediction errors are higher for higher income and more White areas, but lower for higher transit-taking areas. This again tracks with the general under-performance of the model, as percent White and median income are slightly positively correlated with trips and percent using transit is slightly negatively correlated with trips.
Part 3: Feature Engineering & model improvement
After observing the error results from the first five models, I believe the biggest aspects the models are missing are spatial variables that influence bike share usage. Seeing as the under-counting is clustered around center city, I think some likely attributes of center city that make biking more attractive are the relative nearness of start and end destinations to each other (~80% of trips are under 1.5 miles). In other words, I think the model would be a better predictor if it took into account the potential rider’s available close-by destination bike stations where they could park the bike within the average trip length that riders take. Additionally, I think people are more likely to feel comfortable biking when there are bike lanes nearby or roads with a low posted speed limit. I also want to build off the success of adding the temporal lags that improved model 1 to model 2 so much, and add another lag for the same hour in the previous week.
# Step 1: Build LINESTRING geometry for each row — a straight segment from
# the trip's start coordinates to its end coordinates (not the routed path),
# dropping trips with any missing coordinate.
# NOTE(review): rowwise() + st_linestring() per trip is correct but slow for
# ~300k rows; fine for a one-off document build.
trips_sf <- indego %>%
filter(
!is.na(start_lat),
!is.na(start_lon),
!is.na(end_lat),
!is.na(end_lon)
)%>%
rowwise() %>%
mutate(
geometry = list(
st_linestring(
matrix(
c(start_lon, start_lat,
end_lon, end_lat),
ncol = 2,
byrow = TRUE
)
)
)
) %>%
ungroup() %>%
st_as_sf(crs = 4326)
# Step 2: Transform to EPSG:6539, NAD83(2011) / Pennsylvania South —
# note this CRS is in US survey feet, not meters (hence the /5280 below)
trips_sf <- st_transform(trips_sf, 6539)
# Step 3: Calculate straight-line trip distance (feet in this CRS)
trips_sf <- trips_sf %>%
mutate(distance_f = st_length(geometry))
# Convert feet to miles for interpretability
trips_sf <- trips_sf %>%
mutate(
distance_mi = as.numeric(distance_f) / 5280 # 1 mile = 5280 ft
)
summary(trips_sf$distance_mi) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000 0.5484 0.9063 1.0439 1.3783 8.8305
Spatial features:
- Number of low-speed roads near station
- Number and type of Bike lanes near station
- Nearness of station to other stations
roads_sf <- st_read("../data/RMSADMIN_(Administrative_Classifications_of_Roadway)/RMSADMIN_(Administrative_Classifications_of_Roadway).shp")Reading layer `RMSADMIN_(Administrative_Classifications_of_Roadway)' from data source `/Users/kknox/Documents/GitHub/portfolio-setup-kkxix/labs/lab5/data/RMSADMIN_(Administrative_Classifications_of_Roadway)/RMSADMIN_(Administrative_Classifications_of_Roadway).shp'
using driver `ESRI Shapefile'
Simple feature collection with 240887 features and 34 fields (with 1914 geometries empty)
Geometry type: MULTILINESTRING
Dimension: XY
Bounding box: xmin: -8963377 ymin: 4825317 xmax: -8314805 ymax: 5201143
Projected CRS: WGS 84 / Pseudo-Mercator
blanes_sf <- st_read("../data/Bike_Network/Bike_Network.shp")Reading layer `Bike_Network' from data source
`/Users/kknox/Documents/GitHub/portfolio-setup-kkxix/labs/lab5/data/Bike_Network/Bike_Network.shp'
using driver `ESRI Shapefile'
Simple feature collection with 5225 features and 8 fields
Geometry type: LINESTRING
Dimension: XY
Bounding box: xmin: -8378937 ymin: 4847835 xmax: -8345146 ymax: 4883978
Projected CRS: WGS 84 / Pseudo-Mercator
# Restrict to Philadelphia County roadways (CTY_CODE 67) posted at 25 mph or
# below, reproject each layer to the stations' CRS, and tag every layer with
# a label column used by the map legend.
roads_sf <- roads_sf %>%
  filter(
    CTY_CODE == 67,
    SPEED_LIMI <= 25
  ) %>%
  st_transform(st_crs(stations_sf)) %>%
  mutate(label = "Road <25mph")

blanes_sf <- blanes_sf %>%
  st_transform(st_crs(stations_sf)) %>%
  mutate(label = "Bike Lane")

stations_sf <- stations_sf %>%
  mutate(label = "Indego Station")
# map with stations: overlay slow roads, bike lanes, and Indego stations on
# one map, with a manual colour scale keyed to each layer's label column.
ggplot() +
geom_sf(data = roads_sf, aes(color = label), linewidth = 0.1) +
geom_sf(data = blanes_sf, aes(color = label), linewidth = 0.1) +
geom_sf(data = stations_sf, aes(color = label), size = 0.7, alpha = 0.7) +
scale_color_manual(
name = "Legend",
values = c(
"Road <25mph" = "#bdd7e7",
"Bike Lane" = "#F09B4E",
"Indego Station" = "#08519c"
)
) +
labs(title = "Indego Stations, Bike Lanes, and <25mph Roads") +
mapTheme +
theme(
legend.position = "right",
legend.title = element_text(size = 10, face = "bold"),
legend.text = element_text(size = 9)
)stations_buffered <- stations_sf %>%
# NOTE(review): st_buffer's dist is in the layer's CRS units — if
# stations_sf uses a foot-based CRS (e.g. EPSG:6539 as elsewhere in this
# document), this is a 500 ft buffer, not 500 m; confirm the intended radius.
mutate(buffer_geom = st_buffer(geometry, dist = 500))
# ---- Count roads intersecting each buffer ----
# st_intersects returns a list of matching indices per buffer; lengths()
# collapses that to a per-station count.
road_counts <- st_intersects(stations_buffered$buffer_geom, roads_sf) %>%
lengths()
# ---- Count bike lanes intersecting each buffer ----
blane_counts <- st_intersects(stations_buffered$buffer_geom, blanes_sf) %>%
lengths()
# ---- Add results back onto stations_sf ----
# Row order is preserved from stations_sf through the buffer, so positional
# assignment is safe here.
stations_sf <- stations_sf %>%
mutate(
nearby_roads = road_counts,
nearby_bikelanes = blane_counts
)
# Map stations coloured by their count of nearby bike lanes, with the slow
# roads as a light-grey backdrop for spatial context.
ggplot() +
geom_sf(data = roads_sf, color="lightgray", linewidth = 0.1) +
geom_sf(data = stations_sf, aes(color = nearby_bikelanes), size = 0.7, alpha = 0.7) +
labs(title = "Stations Colored by Number of Nearby Bike Lanes") +
mapTheme +
theme(
legend.position = "right",
legend.title = element_text(size = 10, face = "bold"),
legend.text = element_text(size = 9)
)In addition to nearby bike lanes and slow roads, I will add the number of other bike stations within a buffer of the median trip distance of 0.9063 miles.
# 0.9063 miles = 1458.54847 meters
# NOTE(review): st_buffer interprets dist in the CRS's native linear unit.
# If stations_sf is in a foot-based CRS (EPSG:6539, as used for the trip
# distances above), a dist of 1458.5 is ~0.28 miles, not 0.9063 miles —
# confirm the CRS and use ~4785.9 ft if a 0.9063-mile radius is intended.
stations_buffered <- stations_sf %>%
mutate(buffer_geom = st_buffer(geometry, dist = 1458.54847))
# ---- Count indego stations intersecting each buffer ----
# NOTE: every station intersects its own buffer, so counts include the
# station itself (the minimum possible value is 1).
station_counts <- st_intersects(stations_buffered$buffer_geom, stations_sf) %>%
lengths()
# ---- Add results back onto stations_sf ----
stations_sf <- stations_sf %>%
mutate(
nearby_stations = station_counts
)
# Map stations coloured by their count of other stations within the buffer,
# again with slow roads as the backdrop.
ggplot() +
geom_sf(data = roads_sf, color="lightgray", linewidth = 0.1) +
geom_sf(data = stations_sf, aes(color = nearby_stations), size = 0.7, alpha = 0.7) +
labs(title = "Stations Colored by Number of Nearby Other Stations") +
mapTheme +
theme(
legend.position = "right",
legend.title = element_text(size = 10, face = "bold"),
legend.text = element_text(size = 9)
)# Add back to trip data
# Attach the three station-level spatial features to the station-hour panel.
# NOTE(review): stations_sf is an sf object and select() keeps the sticky
# geometry column, so this left_join also carries geometry into study_panel —
# consider st_drop_geometry() before joining.
study_panel <- study_panel %>%
left_join(
stations_sf %>%
select(start_station, nearby_roads, nearby_bikelanes, nearby_stations),
by = "start_station"
)Trip history features:
- Add lag for same hour last week
# One-week temporal lag: trips at the same station 168 hours (7 x 24)
# earlier. Assumes the panel is sorted chronologically within each station —
# TODO confirm the upstream ordering before trusting this lag.
study_panel <- study_panel %>%
group_by(start_station) %>%
mutate(
lag1week = lag(Trip_Count, 168)
) %>%
ungroup()
# Remove rows with NA lags (the first 168 hours — one week — per station,
# since lag1week dominates the shorter lags)
study_panel_complete <- study_panel %>%
filter(!is.na(lag1week) & !is.na(lag1day))
cat("Rows after removing NA lags:", format(nrow(study_panel_complete), big.mark = ","), "\n")Rows after removing NA lags: 571,170
Model 6: Add nearby spatial features and 1-week lag
# Keep only stations present in both train and test periods, then split
# chronologically: weeks before 50 train, weeks 50+ test (no leakage of
# future observations into training).
study_panel_complete <- study_panel_complete %>%
filter(start_station %in% common_stations)
# NOW create train/test split
train <- study_panel_complete %>%
filter(week < 50)
test <- study_panel_complete %>%
filter(week >= 50)
# Day-of-week as an ordered-level factor with Monday as the reference level
train <- train %>%
mutate(dotw_simple = factor(dotw, levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")))
# Set contrasts to treatment coding (dummy variables)
contrasts(train$dotw_simple) <- contr.treatment(7)
test <- test %>%
mutate(dotw_simple = factor(dotw, levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")))
# Set contrasts to treatment coding (dummy variables)
contrasts(test$dotw_simple) <- contr.treatment(7)
# NOTE(review): cat() strips the Date class, so the date ranges below print
# as internal day numbers (e.g. 20003); wrap min/max in format() or
# as.character() to display human-readable dates.
cat("Training observations:", format(nrow(train), big.mark = ","), "\n")Training observations: 377,796
cat("Testing observations:", format(nrow(test), big.mark = ","), "\n")Testing observations: 171,684
cat("Training date range:", min(train$date), "to", max(train$date), "\n")Training date range: 20003 to 20065
cat("Testing date range:", min(test$date), "to", max(test$date), "\n")Testing date range: 20066 to 20088
# Model 6: everything-in model — time, weather, temporal lags (including the
# new one-week lag), demographics, station fixed effects, the rush-hour ×
# weekend interaction, and the new nearby-infrastructure counts.
# NOTE(review): station fixed effects absorb anything constant within a
# station, so the demographic and nearby_* terms are likely collinear with
# the FE (lm will return NA for the redundant coefficients) — verify in the
# summary. The .x/.y suffixes on the demographic columns suggest duplicated
# columns from earlier joins; confirm the intended versions are used.
model6 <- lm(
Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation +
lag1Hour + lag3Hours + lag1day + rush_hour + as.factor(month) +
Med_Inc.x + Percent_Taking_Transit.y + Percent_White.y +
as.factor(start_station) +
rush_hour * weekend + # Rush hour effects different on weekends
nearby_roads + nearby_bikelanes + nearby_stations +
lag1week,
data = train
)
cat("Model 6 R-squared:", summary(model6)$r.squared, "\n")Model 6 R-squared: 0.2569648
cat("Model 6 Adj R-squared:", summary(model6)$adj.r.squared, "\n")Model 6 Adj R-squared: 0.2553348
# Model 7: a leaner specification — Model 2's time/weather/lag structure plus
# the one-week lag, a linear week trend, and the three nearby-infrastructure
# counts, dropping the demographics and station fixed effects that hurt
# out-of-sample accuracy in Models 3-6.
model7 <- lm(
Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation +
lag1Hour + lag3Hours + lag1day + lag1week + as.factor(month) +week+
nearby_roads + nearby_bikelanes + nearby_stations,
data = train
)
summary(model7)
Call:
lm(formula = Trip_Count ~ as.factor(hour) + dotw_simple + Temperature +
Precipitation + lag1Hour + lag3Hours + lag1day + lag1week +
as.factor(month) + week + nearby_roads + nearby_bikelanes +
nearby_stations, data = train)
Residuals:
Min 1Q Median 3Q Max
-10.9185 -0.4683 -0.1218 0.1939 25.6004
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.43496916 0.08269874 5.260 0.00000014438239072 ***
as.factor(hour)1 -0.02757267 0.01140429 -2.418 0.015617 *
as.factor(hour)2 -0.02471651 0.01120616 -2.206 0.027411 *
as.factor(hour)3 -0.04223159 0.01122202 -3.763 0.000168 ***
as.factor(hour)4 -0.02301352 0.01110560 -2.072 0.038243 *
as.factor(hour)5 0.04958198 0.01121689 4.420 0.00000985935730785 ***
as.factor(hour)6 0.23921149 0.01125699 21.250 < 0.0000000000000002 ***
as.factor(hour)7 0.39229409 0.01149508 34.127 < 0.0000000000000002 ***
as.factor(hour)8 0.65535940 0.01132090 57.889 < 0.0000000000000002 ***
as.factor(hour)9 0.28286593 0.01137751 24.862 < 0.0000000000000002 ***
as.factor(hour)10 0.22494720 0.01126621 19.967 < 0.0000000000000002 ***
as.factor(hour)11 0.26742698 0.01128002 23.708 < 0.0000000000000002 ***
as.factor(hour)12 0.35504566 0.01114650 31.853 < 0.0000000000000002 ***
as.factor(hour)13 0.32677897 0.01096672 29.797 < 0.0000000000000002 ***
as.factor(hour)14 0.36286831 0.01096530 33.092 < 0.0000000000000002 ***
as.factor(hour)15 0.45463939 0.01135209 40.049 < 0.0000000000000002 ***
as.factor(hour)16 0.54290308 0.01120398 48.456 < 0.0000000000000002 ***
as.factor(hour)17 0.67039251 0.01129843 59.335 < 0.0000000000000002 ***
as.factor(hour)18 0.37614606 0.01143899 32.883 < 0.0000000000000002 ***
as.factor(hour)19 0.20110538 0.01143051 17.594 < 0.0000000000000002 ***
as.factor(hour)20 0.09719693 0.01151132 8.444 < 0.0000000000000002 ***
as.factor(hour)21 0.07855837 0.01147514 6.846 0.00000000000760801 ***
as.factor(hour)22 0.06896751 0.01137705 6.062 0.00000000134576430 ***
as.factor(hour)23 0.02218531 0.01144897 1.938 0.052654 .
dotw_simple2 0.02377572 0.00588873 4.037 0.00005403574372275 ***
dotw_simple3 0.00925967 0.00600759 1.541 0.123238
dotw_simple4 -0.03318396 0.00574706 -5.774 0.00000000774364607 ***
dotw_simple5 -0.03230693 0.00611195 -5.286 0.00000012518044383 ***
dotw_simple6 -0.03062853 0.00616800 -4.966 0.00000068477166340 ***
dotw_simple7 -0.05470911 0.00624000 -8.767 < 0.0000000000000002 ***
Temperature 0.00293576 0.00022137 13.261 < 0.0000000000000002 ***
Precipitation -0.83668132 0.07082242 -11.814 < 0.0000000000000002 ***
lag1Hour 0.28694920 0.00155823 184.151 < 0.0000000000000002 ***
lag3Hours 0.08684120 0.00152650 56.889 < 0.0000000000000002 ***
lag1day 0.17362425 0.00148343 117.043 < 0.0000000000000002 ***
lag1week 0.09151274 0.00137458 66.575 < 0.0000000000000002 ***
as.factor(month).L 0.06170679 0.00796574 7.747 0.00000000000000947 ***
as.factor(month).Q 0.01781405 0.00316482 5.629 0.00000001816291059 ***
week -0.01881709 0.00166046 -11.332 < 0.0000000000000002 ***
nearby_roads 0.00179729 0.00020358 8.828 < 0.0000000000000002 ***
nearby_bikelanes 0.00126797 0.00008765 14.466 < 0.0000000000000002 ***
nearby_stations 0.00452301 0.00015920 28.411 < 0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9737 on 377754 degrees of freedom
Multiple R-squared: 0.3439, Adjusted R-squared: 0.3438
F-statistic: 4829 on 41 and 377754 DF, p-value: < 0.00000000000000022
# Out-of-sample predictions for the benchmark (Model 2) and the two new
# specifications, then a test-set MAE comparison table.
test <- test %>%
mutate(
pred2 = predict(model2, newdata = test),
pred6 = predict(model6, newdata = test),
pred7 = predict(model7, newdata = test)
)
# Calculate MAE for each model (mean absolute miss in trips per station-hour)
mae_results <- data.frame(
Model = c(
"2. + Temporal Lags",
"6. Model 2 + Demographics + Station FE + Rush Hour Interaction + Week Lag + Nearby Features",
"7. Model 2 + Week Lag + Nearby Features"
),
MAE = c(
mean(abs(test$Trip_Count - test$pred2), na.rm = TRUE),
mean(abs(test$Trip_Count - test$pred6), na.rm = TRUE),
mean(abs(test$Trip_Count - test$pred7), na.rm = TRUE)
)
)
kable(mae_results,
digits = 2,
caption = "Mean Absolute Error by Model (Test Set)",
col.names = c("Model", "MAE (trips)")) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Model | MAE (trips) |
|---|---|
| 2. + Temporal Lags | 0.40 |
| 6. Model 2 + Demographics + Station FE + Rush Hour Interaction + Week Lag + Nearby Features | 0.65 |
| 7. Model 2 + Week Lag + Nearby Features | 0.41 |
# Recompute errors on the held-out weeks using Model 7 predictions, and
# bucket hours into labelled periods for faceting (same cut points as the
# Model 2 error analysis: <7 Overnight, 7-9 AM, 10-14 Mid-Day, 15-18 PM,
# 19-23 Evening).
test <- test %>%
  mutate(
    error = Trip_Count - pred7,
    abs_error = abs(error),
    time_of_day = case_when(
      hour < 7 ~ "Overnight",
      hour >= 7 & hour < 10 ~ "AM Rush",
      hour >= 10 & hour < 15 ~ "Mid-Day",
      hour >= 15 & hour <= 18 ~ "PM Rush",
      hour > 18 ~ "Evening"
    )
  )
# Scatter plot by time and day type
ggplot(test, aes(x = Trip_Count, y = pred7)) +
  geom_point(alpha = 0.2, color = "#3182bd") +
  geom_abline(slope = 1, intercept = 0, color = "red", linewidth = 1) +
  geom_smooth(method = "lm", se = FALSE, color = "darkgreen") +
  facet_grid(weekend ~ time_of_day) +
  labs(
    title = "Observed vs. Predicted Bike Trips",
    # Fixed: the subtitle said "Model 2" but this panel plots pred7 (Model 7)
    subtitle = "Model 7 performance by time period",
    x = "Observed Trips",
    y = "Predicted Trips",
    caption = "Red line = perfect predictions; Green line = actual model fit"
  ) +
  plotTheme
# Calculate MAE by station
# Station-level error summary for Model 7 on the held-out weeks.
# NOTE(review): two fixes versus the original — (1) the MAE line computed
# errors from pred6 while the filter and the map labels refer to Model 7;
# it now uses pred7 consistently, and (2) an unfiltered station_errors
# computation that was immediately overwritten has been removed.
## Create Two Maps Side-by-Side with Proper Legends (sorry these maps are ugly)
station_errors <- test %>%
  filter(!is.na(pred7)) %>%
  group_by(start_station, start_lat.x, start_lon.y) %>%
  summarize(
    MAE = mean(abs(Trip_Count - pred7), na.rm = TRUE),
    avg_demand = mean(Trip_Count, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(!is.na(start_lat.x), !is.na(start_lon.y))
# Map 1: Prediction Errors for Model 7, styled identically to the Model 2
# error map (p1) so the two can be compared side by side below.
p1.1 <- ggplot() +
  geom_sf(data = philly_census, fill = "grey95", color = "white", size = 0.2) +
  geom_point(
    data = station_errors,
    aes(x = start_lon.y, y = start_lat.x, color = MAE),
    size = 3.5,
    alpha = 0.7
  ) +
  scale_color_viridis(
    option = "plasma",
    name = "MAE\n(trips)",
    direction = -1,
    breaks = c(0.5, 1.0, 1.5), # Fewer, cleaner breaks
    labels = c("0.5", "1.0", "1.5")
  ) +
  labs(title = "Prediction Errors ",
       subtitle = "Model 7") +
  mapTheme +
  theme(
    legend.position = "right",
    legend.title = element_text(size = 10, face = "bold"),
    legend.text = element_text(size = 9),
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 10)
  ) +
  guides(color = guide_colorbar(
    barwidth = 1.5,
    barheight = 12,
    title.position = "top",
    title.hjust = 0.5
  ))
p1 | p1.1Model 7 MAEs are less severe in the areas just west and north of center city; however, the patterns largely follow the same spatial distribution as model 2.
Poisson Model
# Poisson GLM with the same predictors as Model 7. The log link guarantees
# non-negative fitted counts, which the linear models cannot; the lag terms
# therefore act multiplicatively on expected trips rather than additively.
p_model <- glm(
Trip_Count ~ as.factor(hour) + dotw_simple + Temperature + Precipitation +
lag1Hour + lag3Hours + lag1day + lag1week + as.factor(month) +week+
nearby_roads + nearby_bikelanes + nearby_stations,
data = train,
family = poisson(link = "log")
)
summary(p_model)
Call:
glm(formula = Trip_Count ~ as.factor(hour) + dotw_simple + Temperature +
Precipitation + lag1Hour + lag3Hours + lag1day + lag1week +
as.factor(month) + week + nearby_roads + nearby_bikelanes +
nearby_stations, family = poisson(link = "log"), data = train)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.3008142 0.1055903 -2.849 0.00439 **
as.factor(hour)1 -0.4532014 0.0312112 -14.520 < 0.0000000000000002 ***
as.factor(hour)2 -0.7459004 0.0339418 -21.976 < 0.0000000000000002 ***
as.factor(hour)3 -1.3348163 0.0427056 -31.256 < 0.0000000000000002 ***
as.factor(hour)4 -1.2040168 0.0400945 -30.029 < 0.0000000000000002 ***
as.factor(hour)5 -0.0591638 0.0279322 -2.118 0.03417 *
as.factor(hour)6 0.8693465 0.0229103 37.946 < 0.0000000000000002 ***
as.factor(hour)7 1.2721808 0.0217796 58.412 < 0.0000000000000002 ***
as.factor(hour)8 1.6166590 0.0208355 77.592 < 0.0000000000000002 ***
as.factor(hour)9 1.2273334 0.0214022 57.346 < 0.0000000000000002 ***
as.factor(hour)10 1.1005384 0.0216460 50.843 < 0.0000000000000002 ***
as.factor(hour)11 1.1760243 0.0215526 54.565 < 0.0000000000000002 ***
as.factor(hour)12 1.2803074 0.0211569 60.515 < 0.0000000000000002 ***
as.factor(hour)13 1.2686336 0.0210216 60.349 < 0.0000000000000002 ***
as.factor(hour)14 1.2881641 0.0208965 61.645 < 0.0000000000000002 ***
as.factor(hour)15 1.3849675 0.0209103 66.234 < 0.0000000000000002 ***
as.factor(hour)16 1.4503161 0.0206794 70.133 < 0.0000000000000002 ***
as.factor(hour)17 1.5076652 0.0205865 73.236 < 0.0000000000000002 ***
as.factor(hour)18 1.2402372 0.0209488 59.203 < 0.0000000000000002 ***
as.factor(hour)19 1.0421225 0.0214496 48.585 < 0.0000000000000002 ***
as.factor(hour)20 0.8459129 0.0220879 38.298 < 0.0000000000000002 ***
as.factor(hour)21 0.7262344 0.0227359 31.942 < 0.0000000000000002 ***
as.factor(hour)22 0.5944549 0.0233785 25.427 < 0.0000000000000002 ***
as.factor(hour)23 0.3319015 0.0249391 13.308 < 0.0000000000000002 ***
dotw_simple2 0.0465151 0.0073389 6.338 0.00000000023258043 ***
dotw_simple3 0.0192599 0.0076119 2.530 0.01140 *
dotw_simple4 -0.0706125 0.0076472 -9.234 < 0.0000000000000002 ***
dotw_simple5 -0.0686913 0.0080993 -8.481 < 0.0000000000000002 ***
dotw_simple6 -0.0645385 0.0082180 -7.853 0.00000000000000405 ***
dotw_simple7 -0.1279885 0.0084311 -15.181 < 0.0000000000000002 ***
Temperature 0.0068856 0.0002791 24.666 < 0.0000000000000002 ***
Precipitation -3.4025809 0.1945009 -17.494 < 0.0000000000000002 ***
lag1Hour 0.1370088 0.0010631 128.879 < 0.0000000000000002 ***
lag3Hours 0.0663740 0.0012058 55.045 < 0.0000000000000002 ***
lag1day 0.0846167 0.0011095 76.263 < 0.0000000000000002 ***
lag1week 0.0675999 0.0011719 57.683 < 0.0000000000000002 ***
as.factor(month).L 0.1681597 0.0109417 15.369 < 0.0000000000000002 ***
as.factor(month).Q -0.0239398 0.0045030 -5.316 0.00000010584953979 ***
week -0.0602140 0.0021050 -28.605 < 0.0000000000000002 ***
nearby_roads 0.0054790 0.0002687 20.392 < 0.0000000000000002 ***
nearby_bikelanes 0.0011617 0.0001093 10.626 < 0.0000000000000002 ***
nearby_stations 0.0158128 0.0002146 73.690 < 0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for poisson family taken to be 1)
Null deviance: 621223 on 377795 degrees of freedom
Residual deviance: 393742 on 377754 degrees of freedom
AIC: 685434
Number of Fisher Scoring iterations: 6
# Poisson predictions on the test weeks; type = "response" back-transforms
# from the log scale to expected trip counts so the MAE is comparable to the
# linear models'.
test <- test %>%
mutate(
p_pred = predict(p_model, newdata = test, type = "response")
)
# Calculate MAE for each model, now including the Poisson specification
mae_results <- data.frame(
Model = c(
"2. + Temporal Lags",
"6. Model 2 + Demographics + Station FE + Rush Hour Interaction + Week Lag + Nearby Features",
"7. Model 2 + Week Lag + Nearby Features",
"Poisson Model (Same Variables as Model 7)"
),
MAE = c(
mean(abs(test$Trip_Count - test$pred2), na.rm = TRUE),
mean(abs(test$Trip_Count - test$pred6), na.rm = TRUE),
mean(abs(test$Trip_Count - test$pred7), na.rm = TRUE),
mean(abs(test$Trip_Count - test$p_pred), na.rm = TRUE)
)
)
kable(mae_results,
digits = 2,
caption = "Mean Absolute Error by Model (Test Set)",
col.names = c("Model", "MAE (trips)")) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Model | MAE (trips) |
|---|---|
| 2. + Temporal Lags | 0.40 |
| 6. Model 2 + Demographics + Station FE + Rush Hour Interaction + Week Lag + Nearby Features | 0.65 |
| 7. Model 2 + Week Lag + Nearby Features | 0.41 |
| Poisson Model (Same Variables as Model 7) | 0.41 |
The Poisson Model does not improve MAE very much, however it does not result in any negative predictions, which inherently makes it a better method for predicting trip counts in this context.
Part 4: Critical Reflection
Overall, I would not recommend using any of the models produced in this study to predict real, exact bike station counts; however, they could instead be useful for posing questions for further research into what factors affect bike station use the most, and how to make more accurate predictions at the extremes while not foregoing less busy stations. The models presented in this study all fell very short of the highest-traffic station-times, and these are essential to get right in a usable model because identifying them equates to predicting where the bulk of the bikes are at a given time in order to make redistribution more efficient. The factors that improved the model the most were time lags, meaning that the trips from a station at various times previous to the current time had a strong predictive effect; however, it could be that the influence of these variables makes the model less adequate at predicting the extremes of trips. In other words, the time lags create a model that is more normalized over time, but the key to a useful model would be in predicting the more extreme trip counts that will have higher redistribution needs. The spatial distribution of prediction errors is not particularly egregious from a visual inspection of the map.
The biggest takeaway is that predicting bike share station usage by hour would benefit from additional complexity that is able to model the more extreme ends of bike share usage, perhaps using morning and evening rush as factor variables in addition to the hour of the day as a factor variable. Another aspect that might help in future iterations is accounting for holidays, again as a factor variable marking if the day is a holiday or not. Even if there is future improvement on the model, caution should be used on irregular public events or extreme weather events that will significantly skew bike trips in an unpredictable way.